import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import svm
from sklearn.svm import SVC
%matplotlib inline
def getAccuracy(testSet, predictions):
    """Return the percentage of labels in ``testSet`` matched by ``predictions``.

    Labels are compared position by position with ``==``.

    Args:
        testSet: Sequence of true labels (list, NumPy array, ...).
        predictions: Sequence of predicted labels. It must be at least as
            long as ``testSet``; any extra trailing entries are ignored.

    Returns:
        Accuracy as a float between 0.0 and 100.0. An empty ``testSet``
        yields 0.0 rather than a division-by-zero error.

    Raises:
        IndexError: If ``predictions`` is shorter than ``testSet``.
    """
    total = len(testSet)
    if total == 0:
        return 0.0
    if len(predictions) < total:
        # zip() would silently truncate; keep the failure explicit.
        raise IndexError("predictions is shorter than testSet")
    correct = sum(
        1 for actual, predicted in zip(testSet, predictions)
        if actual == predicted
    )
    return (correct / float(total)) * 100.0
# Load vehicle-1.csv into a DataFrame.
# NOTE(review): header=1 tells pandas to take the column names from the
# SECOND line of the file and to skip the line before it. The rest of this
# script indexes columns by name ('class', 'compactness', ...), so this only
# works if the file really has an extra leading line -- confirm against the
# CSV; header=0 is the usual value when the names are on the first line.
vehicle_df = pd.read_csv("vehicle-1.csv", header = 1)
# Notebook-style inspection: first rows, column dtypes, (rows, columns), and
# per-column summary statistics (one row per column after the transpose).
# As bare expressions these only display something in an interactive session.
vehicle_df.head(10)
vehicle_df.dtypes
vehicle_df.shape
vehicle_df.describe().transpose()
All columns except 'class' are described above; 'class' is the target variable. Every variable except compactness, max.length_aspect_ratio, max.length_rectangularity and hollows_ratio has missing values.
# Class balance of the raw target labels. The column is passed as ``x=``
# because seaborn 0.12+ makes every argument after ``data`` keyword-only, so a
# positional Series would be taken as ``data`` instead of the plotted variable.
sns.countplot(x=vehicle_df['class'])
# Fill each missing value with its column's median. ``numeric_only=True``
# keeps the non-numeric 'class' column out of the median computation; without
# it pandas 2.0+ raises a TypeError. 'class' gets no fill value and is left
# untouched by fillna.
vehicle_df = vehicle_df.fillna(vehicle_df.median(numeric_only=True))
# Re-run the summary so the counts can be checked after imputation.
vehicle_df.describe().transpose()
The missing values have now been replaced by the median of each variable. There are no missing values in the data set now.
from sklearn import preprocessing

# Replace the 'class' labels with integer codes so the column can take part in
# numeric operations further down (correlation matrix, NumPy feature array).
label_encoder = preprocessing.LabelEncoder()
vehicle_df['class'] = label_encoder.fit_transform(vehicle_df['class'])
# Show the distinct codes produced by the encoder.
vehicle_df['class'].unique()
# Class balance again, now on the encoded column. Passed as ``x=`` because
# seaborn 0.12+ reserves the first positional argument for ``data``.
sns.countplot(x=vehicle_df['class'])
vehicle_df.info()
# Feature columns to inspect, in the order the original cells plotted them.
OUTLIER_CHECK_COLUMNS = (
    'compactness',
    'circularity',
    'distance_circularity',
    'radius_ratio',
    'pr.axis_aspect_ratio',
    'max.length_aspect_ratio',
    'scatter_ratio',
    'elongatedness',
    'pr.axis_rectangularity',
    'max.length_rectangularity',
    'scaled_variance',
    'scaled_variance.1',
    'scaled_radius_of_gyration',
    'scaled_radius_of_gyration.1',
    'skewness_about',
    'skewness_about.1',
    'skewness_about.2',
    'hollows_ratio',
)

# For every feature: draw a vertical box plot and print the 1.5 * IQR fences
# (values below the lower fence or above the upper fence count as outliers).
# This replaces eighteen copy-pasted stanzas; the 'compactness' stanza used to
# print "circularity" in its messages, which the loop fixes by construction.
for column in OUTLIER_CHECK_COLUMNS:
    # One figure per feature, otherwise successive boxes land on shared axes.
    plt.figure()
    # ``y=`` instead of a positional Series: seaborn 0.12+ treats the first
    # positional argument as ``data``.
    sns.boxplot(y=vehicle_df[column], fliersize=2, orient='v')

    # Q1, Q3, L_outliers and U_outliers stay module-level names, as before;
    # after the loop they hold the values for the last column.
    Q1 = vehicle_df[column].quantile(q=0.25)
    Q3 = vehicle_df[column].quantile(q=0.75)
    L_outliers = Q1 - 1.5 * (Q3 - Q1)
    U_outliers = Q3 + 1.5 * (Q3 - Q1)
    print('Lower outliers in {}: '.format(column), L_outliers)
    print('Upper outliers in {}: '.format(column), U_outliers)
    plt.show()
# Work on an independent copy. A plain ``vehicle_df1 = vehicle_df`` only
# creates a second name for the same DataFrame, so the replacement below
# would silently rewrite ``vehicle_df`` as well.
vehicle_df1 = vehicle_df.copy()

# For every column except the last one (the target), replace values outside
# the 1.5 * IQR fences with that column's median.
for col_name in vehicle_df1.columns[:-1]:
    column = vehicle_df1[col_name]
    q1 = column.quantile(0.25)
    q3 = column.quantile(0.75)
    iqr = q3 - q1
    low = q1 - 1.5 * iqr
    high = q3 + 1.5 * iqr
    is_outlier = (column < low) | (column > high)
    # The median is taken from the column before any replacement. mask()
    # builds a new column instead of writing in place, so an integer column
    # can receive a fractional median without an in-place dtype conflict.
    vehicle_df1[col_name] = column.mask(is_outlier, column.median())

# All columns side by side, to check the effect of the replacement.
vehicle_df1.boxplot(figsize=(40,20))
# Scatter-plot matrix of every column pair, coloured by class, with kernel
# density estimates on the diagonal.
sns.pairplot(vehicle_df1, hue='class', diag_kind='kde')

# Pairwise correlation matrix of all columns (the encoded class included).
cor = vehicle_df1.corr()
cor

# Annotated heat map of the correlation matrix; the colour scale is capped
# at 0.8 via vmax.
sns.set(font_scale=1.15)
plt.figure(figsize=(20, 15))
sns.heatmap(
    cor,
    vmax=.8,
    linewidths=0.01,
    square=True,
    annot=True,
    cmap="coolwarm",
    linecolor="black",
)
plt.title('Correlation between features')

# Frame dimensions and the number of rows per class.
vehicle_df1.shape
vehicle_df1.groupby(["class"]).count()
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Split the frame's values into the first 18 columns (features) and
# column 18 (target).
array = vehicle_df1.values
X = array[:, :18]
Y = array[:, 18]

# NOTE(review): this encoder is fitted on Y but not used again in the visible
# code -- confirm whether it is still needed.
le = preprocessing.LabelEncoder().fit(Y)

test_size = 0.30  # 70:30 split between training and test set
seed = 7  # fixed random seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=test_size, random_state=seed
)
type(X_train)

# Fit a support vector classifier on the training part and predict the
# held-out part.
clf = svm.SVC(gamma=0.0025, C=3).fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Percentage accuracy via getAccuracy rather than a direct array comparison.
SVM_Accuracy = getAccuracy(y_test, y_pred)
SVM_Accuracy
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Rebuild the feature matrix (first 18 columns) and target (column 18) from
# the outlier-treated frame.
vehiclekfold_df = vehicle_df1
array = vehiclekfold_df.values
X = array[:,0:18]
y = array[:,18]
# NOTE: this 50:50 split is not consumed by the cross-validation below, which
# receives the full X and y. It is kept because it rebinds X_train, X_test,
# y_train and y_test at module level.
X_train, X_test, y_train,y_test = train_test_split(X, y, test_size=0.50, random_state=1)

num_folds = 50
seed = 7
# random_state only has an effect when shuffle=True; current scikit-learn
# raises ValueError if a seed is given while shuffle is left at False.
# Enabling shuffle makes the seed meaningful and the fold assignment
# repeatable.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
# One score per fold, using the classifier configured earlier as `clf`.
results1 = cross_val_score(clf, X, y, cv=kfold)
accuracy = np.mean(abs(results1))
print('Average accuracy: ',accuracy)
print('Standard Deviation: ',results1.std())
# Covariance matrix of the features (columns are variables, rows are
# observations, hence rowvar=False).
covMatrix = np.cov(X, rowvar=False)
print(covMatrix)

# Six-component PCA: print the explained variances, the component vectors
# and the explained-variance ratios, in that order.
pca = PCA(n_components=6)
pca.fit(X)
for fitted_attribute in (
    pca.explained_variance_,
    pca.components_,
    pca.explained_variance_ratio_,
):
    print(fitted_attribute)

# Horizontal positions 1..6, one per retained component.
component_numbers = list(range(1, 7))

# Share of variance explained by each component.
plt.bar(component_numbers, pca.explained_variance_ratio_, alpha=0.5, align='center')
plt.ylabel('Variation explained')
plt.xlabel('eigen Value')
plt.show()

# Running total of the explained-variance ratios.
cumulative_ratio = np.cumsum(pca.explained_variance_ratio_)
plt.step(component_numbers, cumulative_ratio, where='mid')
plt.ylabel('Cum of variation explained')
plt.xlabel('eigen Value')
plt.show()

# Three-component PCA, then project X onto those components and plot the
# projected columns against each other.
pca3 = PCA(n_components=3)
pca3.fit(X)
print(pca3.components_)
print(pca3.explained_variance_ratio_)
Xpca3 = pca3.transform(X)
sns.pairplot(pd.DataFrame(Xpca3))
# Fit a default-parameter support vector classifier on the three PCA scores.
# The class is reached through the ``svm`` module imported at the top of the
# file: the bare name ``SVC`` was never successfully imported (the original
# import asked for ``svc``, which sklearn.svm does not provide).
SVM_model = svm.SVC()
SVM_model.fit(Xpca3, Y)
# NOTE: this scores the model on the same samples it was fitted on, so the
# value is a training accuracy, not a held-out estimate.
SVM_model.score(Xpca3, Y)